Objective: predict the concrete compressive strength using the data available in the file concrete_data.xls. Apply feature engineering and model tuning to obtain an R² score between 80% and 95%.
Data Characteristics:
The actual concrete compressive strength (MPa) for a given mixture under a specific age (days) was determined from laboratory. Data is in raw form (not scaled).
Summary Statistics:
Number of instances (observations): 1030 Number of Attributes: 9 Attribute breakdown: 8 quantitative input variables, and 1 quantitative output variable Missing Attribute Values: None
Variable Information on CSV file:
Given is the variable name, variable type, the measurement unit and a brief description. The concrete compressive strength is the regression problem. The order of this listing corresponds to the order of numerals along the rows of the database.
Name -- Data Type -- Measurement -- Description
Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/
https://github.com/GreatLearningAIML1/gl-pgp-aiml-uta-intl-may20-ssetty3.git
# Core numerical, data-handling, and plotting libraries used throughout.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the concrete mixture dataset (per the brief: 1030 rows, 8 input
# variables plus the 'strength' target).
# NOTE(review): the brief mentions concrete_data.xls but this reads
# concrete.csv — confirm both contain the same data.
df_concrete = pd.read_csv('concrete.csv')
# Preview the first 10 rows.
df_concrete.head(10)
Input parameters:
Output parameter:
# check missing values
print ('The missing values are >>> \n', df_concrete.isnull().sum())
# Describe the data and check missing values
print('The data Distribution >>>> \n', df_concrete.describe(),"\n ---------------- ")
# pandas_profiling
# Generate an automated EDA report and persist it as an HTML file.
from pandas_profiling import ProfileReport
profile = ProfileReport(df_concrete, title="Cement Pandas Profiling Report")
profile.to_file("Cement Pandas Profiling Report.html")
# Evaluating the profile object renders the report inline in the notebook.
profile
print('The shape of < concrete > given data set is >> ', df_concrete.shape)
# Univariate Analysis: distribution of every input feature, then the target.
# BUG FIX: the original filter was `i not in 'compressive_strength'`, which is
# a SUBSTRING test on a string, not membership in a list of column names — it
# only excluded the target because 'strength' happens to be a substring of
# 'compressive_strength'. Exclude the target column explicitly instead.
cols = [i for i in df_concrete.columns if i != 'strength']
length = len(cols)
# One color per feature subplot (8 features -> 4x2 grid).
cs = ["b","r","g","c","m","k","lime","c"]
fig = plt.figure(figsize=(13,25))
for i,j,k in itertools.zip_longest(cols,range(length),cs):
    plt.subplot(4,2,j+1)
    ax = sns.distplot(df_concrete[i],color=k,rug=True)
    # Dashed vertical line marking the feature mean.
    plt.axvline(df_concrete[i].mean(),linestyle="dashed",label="mean",color="k")
    plt.legend(loc="best")
    plt.title(i,)
    plt.xlabel("")
# Distribution of the target variable on its own figure.
plt.figure(figsize=(13,6))
sns.distplot(df_concrete["strength"],rug=True)
plt.title("Compressivee strength distribution")
plt.show()
# Bivariate Analysis
# Pairwise scatter plots of every column against every other.
sns.pairplot(df_concrete)
plt.show()
# Correlation heatmap across all columns, annotated with coefficients.
plt.figure(figsize = (20,20))
sns.heatmap(df_concrete.corr(), annot=True,linewidths=0.2,annot_kws={'size':11})
Observation: From the correlation matrix above, strength shows a strong correlation with a subset of the input features. The same influence is studied below via scatter plots.
# Scatter of fly ash vs cement, with compressive strength encoded as both
# marker color and marker size.
fig = plt.figure(figsize=(13,8))
ax = fig.add_subplot(111)
plt.scatter(df_concrete["ash"], df_concrete["cement"],
            c=df_concrete["strength"], s=df_concrete["strength"]*3,
            linewidth=1, edgecolor="k", cmap="viridis")
ax.set_facecolor("w")
ax.set_xlabel("Fly Ash in kg")
ax.set_ylabel("cement in kg")
lab = plt.colorbar()
lab.set_label("compressive_strength")
# BUG FIX: the original title said "cement and water" (copy-paste error);
# this plot actually shows fly ash against cement.
plt.title("scatter plot between fly ash and cement")
plt.grid(True, alpha=.3)
plt.show()
Observation: To achieve higher compressive strength, better results are obtained with less fly ash; for the model, limit fly ash to about 125 kg.
# Scatter of curing age vs water content, with compressive strength encoded
# as both marker color and marker size.
fig = plt.figure(figsize=(13,8))
ax = fig.add_subplot(111)
plt.scatter(df_concrete["age"], df_concrete["water"],
            c=df_concrete["strength"], s=df_concrete["strength"]*3,
            linewidth=1, edgecolor="k", cmap="viridis")
ax.set_facecolor("w")
ax.set_xlabel("age in days")
ax.set_ylabel("water in Kg")
lab = plt.colorbar()
lab.set_label("compressive_strength")
# BUG FIX: the original title said "cement and water" (copy-paste error);
# this plot actually shows age against water.
plt.title("scatter plot between age and water")
plt.grid(True, alpha=.3)
plt.show()
Observation: Better compressive strengths are obtained with water content below 200 kg and curing ages up to 200 days in the samples.
# Scatter of superplasticizer vs fly ash, with compressive strength encoded
# as both marker color and marker size.
fig = plt.figure(figsize=(13,8))
ax = fig.add_subplot(111)
plt.scatter(df_concrete["superplastic"], df_concrete["ash"],
            c=df_concrete["strength"], s=df_concrete["strength"]*3,
            linewidth=1, edgecolor="k", cmap="viridis")
ax.set_facecolor("w")
ax.set_xlabel("superplastic in kg")
ax.set_ylabel("ash in Kg")
lab = plt.colorbar()
lab.set_label("compressive_strength")
# BUG FIX: the original title said "cement and water" (copy-paste error);
# this plot actually shows superplasticizer against fly ash.
plt.title("scatter plot between superplastic and ash")
plt.grid(True, alpha=.3)
plt.show()
Observation: Strength is higher when fly ash is below 125 kg and superplasticizer is up to 25 kg.
# Scatter of slag vs superplasticizer, with compressive strength encoded
# as both marker color and marker size.
fig = plt.figure(figsize=(13,8))
ax = fig.add_subplot(111)
plt.scatter(df_concrete["slag"], df_concrete["superplastic"],
            c=df_concrete["strength"], s=df_concrete["strength"]*3,
            linewidth=1, edgecolor="k", cmap="viridis")
ax.set_facecolor("w")
ax.set_xlabel("Slag in kg")
ax.set_ylabel("Superplastic in Kg")
lab = plt.colorbar()
lab.set_label("compressive_strength")
# BUG FIX: the original title said "cement and water" (copy-paste error);
# this plot actually shows slag against superplasticizer.
plt.title("scatter plot between slag and superplastic")
plt.grid(True, alpha=.3)
plt.show()
Observation: Strength is higher when slag is below 300 kg and superplasticizer is up to 25 kg.
# Joint (2-D KDE) density of each input feature against the target strength.
# BUG FIX: `i not in 'strength'` was a SUBSTRING test on the string 'strength',
# not list membership; use an explicit inequality against the target column.
cols = [i for i in df_concrete.columns if i != 'strength']
length = len(cols)
plt.figure(figsize=(13,27))
for i,j in itertools.zip_longest(cols,range(length)):
    plt.subplot(4,2,j+1)
    sns.kdeplot(df_concrete[i],
                df_concrete["strength"],
                cmap="hot",
                shade=True)
    plt.title(i+" & compressive_strength",color="navy")
The list of techniques and approaches to be tried, with comments:
# Outlier treatment on 'age': cap extreme curing ages (~200+ days), motivated
# by the age-vs-water scatter and the correlation map.
sns.boxplot(df_concrete['age'])
from scipy.stats import zscore
# Notebook display of the mean age (expression result is shown, not stored).
df_concrete[['age']].mean()
# NOTE(review): these two lines add a z-scored 'age_outliers' column to
# df_concrete itself; the column persists in df_concrete and is only removed
# from the df_concrete2 copy — later cells that reuse df_concrete see it.
df_concrete['age_outliers'] = df_concrete['age']
df_concrete['age_outliers']= zscore(df_concrete['age_outliers'])
# Rows whose age z-score is above 3.5 or below -3.0 are treated as outliers.
condition1 = (df_concrete['age_outliers']>3.5) | (df_concrete['age_outliers']<-3.0 )
df_concrete1 = df_concrete.drop(df_concrete[condition1].index, axis = 0, inplace = False)
# Drop the helper column from the cleaned frame.
df_concrete2 = df_concrete1.drop('age_outliers', axis=1)
# Boxplot after outlier removal, for comparison with the one above.
sns.boxplot(df_concrete2['age'])
print('The shape of the data frame original Rows*Columns is = ', df_concrete.shape , "\n--------------------------------------")
print('The shape of the data frame after setting boundary to 200 days Rows*Columns is = ', df_concrete2.shape , "\n--------------------------------------")
# Standardize all columns (including the target) of the cleaned frame to zero
# mean and unit variance with StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the original comment claimed the target has "values like 0 and
# 1"; that is wrong — 'strength' is continuous, so it IS rescaled here. The
# resulting df_sconcrete is only used for the plot below, not for modelling.
df_concrete3= scaler.fit_transform(df_concrete2)
df_sconcrete = pd.DataFrame(df_concrete3)
# fit_transform returns a bare ndarray, so the column names are restored here.
df_sconcrete.columns=['cement','slag','ash','water','superplastic','coarseagg',
'fineagg','age','strength']
# Distribution of the (standardized) target.
plt.figure(figsize=(13,6))
sns.distplot(df_sconcrete["strength"],rug=True)
plt.title("Compressivee strength distribution")
plt.show()
Observation: Based on the current data, a simple train/test split is a reasonable choice.
# Data Split
# Independent variables: drop the target AND the helper z-score column that
# the outlier-treatment step added to df_concrete.
# BUG FIX: the original dropped only 'strength', so the leftover
# 'age_outliers' z-score column leaked into X (the "haveing errors" the
# original comments pointed at).
X = df_concrete.drop(['strength', 'age_outliers'], axis=1)
# The dependent variable (kept as a one-column DataFrame, as before).
y = df_concrete[['strength']]
# Split X and y into training and test sets in a 70:30 ratio.
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.30, random_state=1)
# Sanity check: report any positions holding overflow-scale float values.
np.where(X.values >= np.finfo(np.float64).max)
from sklearn.preprocessing import StandardScaler
# Standardize the features, fitting the scaler on the training split only and
# applying the same transform to the test split (avoids test-set leakage).
feature_scaler = StandardScaler()
train_X = feature_scaler.fit_transform(train_X)
test_X = feature_scaler.transform(test_X)
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import mean_squared_error
# NOTE(review): accuracy/ROC imports below are classification metrics and are
# never used for this regression problem.
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve
# Report the shapes of the train/test splits.
print('X variable shapes',"\n", train_X.shape)
print(test_X.shape,"\n-----------------")
print('Y Variable shapes',"\n",train_Y.shape)
print(test_Y.shape)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
target = "strength"

def model(algorithm, dtrainx, dtrainy, dtestx, dtesty, of_type):
    """Fit `algorithm` on the training data and print its 20-fold CV error.

    algorithm  -- any sklearn-style estimator with fit/predict.
    dtrainx/dtrainy -- training features and target.
    dtestx/dtesty   -- test features and target (predictions are made on
                       dtestx to exercise the fitted model end to end).
    of_type    -- kept for interface compatibility; unused here.

    Scores printed are sklearn's neg_mean_squared_error, so values closer
    to 0 are better.
    """
    print (algorithm)
    print ("********************************")
    algorithm.fit(dtrainx, dtrainy)
    # FIX: the original also wrapped this in pd.DataFrame and discarded it —
    # that dead conversion is removed; the prediction value itself was never
    # used for any printed metric.
    prediction = algorithm.predict(dtestx)
    cross_val = cross_val_score(algorithm, dtrainx, dtrainy, cv=20,
                                scoring="neg_mean_squared_error")
    cross_val = cross_val.ravel()
    print ("CROSS VALIDATION SCORE")
    print ("************************")
    print ("cv-mean :", cross_val.mean())
    print ("cv-std :", cross_val.std())
    print ("cv-max :", cross_val.max())
    print ("cv-min :", cross_val.min())
# Baseline model: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model(lr, train_X, train_Y, test_X, test_Y, "coef")
print("************************","\n")
# R^2 on each split (sklearn's .score for regressors).
train_r2 = lr.score(train_X, train_Y)
test_r2 = lr.score(test_X, test_Y)
print("Trainig accuracy", train_r2, "\n")
print("Testing accuracy", test_r2, "\n***********************")
# AdaBoost ensemble regressor with default hyper-parameters.
from sklearn.ensemble import AdaBoostRegressor
adb = AdaBoostRegressor()
model(adb, train_X, train_Y, test_X, test_Y, "feat")
print("************************","\n")
# R^2 on each split (sklearn's .score for regressors).
train_r2 = adb.score(train_X, train_Y)
test_r2 = adb.score(test_X, test_Y)
print("Trainig accuracy", train_r2, "\n")
print("Testing accuracy", test_r2, "\n***********************")
# Gradient boosting regressor with default hyper-parameters.
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
model(gbr, train_X, train_Y, test_X, test_Y, "feat")
print("************************","\n")
# BUG FIX: the original printed adb's (AdaBoost's) scores here instead of
# gbr's — a copy-paste error that misreported this model's accuracy.
print("Trainig accuracy", gbr.score(train_X, train_Y), "\n")
print("Testing accuracy", gbr.score(test_X, test_Y), "\n***********************")
# Random forest regressor with default hyper-parameters.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
model(rf, train_X, train_Y, test_X, test_Y, "feat")
print("************************","\n")
# R^2 on each split (sklearn's .score for regressors).
train_r2 = rf.score(train_X, train_Y)
test_r2 = rf.score(test_X, test_Y)
print("Trainig accuracy", train_r2, "\n")
print("Testing accuracy", test_r2, "\n***********************")
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
from time import time
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn import svm, datasets
# Helper used after a hyper-parameter search to summarize the result.
def clf_performance(classifier, model_name):
    """Print the model name, best CV score, and best parameters of a fitted
    search object (e.g. GridSearchCV / RandomizedSearchCV)."""
    best_score = classifier.best_score_
    best_params = classifier.best_params_
    print(model_name)
    print('Best Score: ' + str(best_score))
    print('Best Parameters: ' + str(best_params))
# Grid search over random-forest hyper-parameters.
# BUG FIX: the original used RandomForestClassifier with classification-only
# parameters ('gini'/'entropy' criteria, scalar class_weight values) on the
# CONTINUOUS 'strength' target, so .fit raised — that is the error noted
# below in the original notebook. Use RandomForestRegressor (imported above)
# with a regression-valid grid. (The original comment also said "RandomSearch"
# although this is a GridSearchCV.)
rf = RandomForestRegressor(random_state = 1)
parameters = {'n_estimators': [100,125,150,175,200,225,250],
              'max_depth': [2,4,6,8,10],
              'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
              'min_samples_split': [2,3,4,5,6,7]}
clf_rf = GridSearchCV(rf, param_grid = parameters, cv = 5, verbose = True, n_jobs = -1)
# .values.ravel() passes y as a 1-D array, avoiding the column-vector warning.
best_clf_rf = clf_rf.fit(train_X, train_Y.values.ravel())
clf_performance(best_clf_rf, 'Random Forest')
# Standalone cross-validated random forest as a simpler check.
# BUG FIX: the original fit error came from using RandomForestClassifier
# (and classification accuracy) on the continuous 'strength' target. A
# regressor with cross_val_score's default scoring (R^2 for regressors) is
# the correct formulation and matches the notebook's 80-95% R^2 goal.
from sklearn.ensemble import RandomForestRegressor
classifier = RandomForestRegressor(n_estimators=300, random_state=0)
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(estimator=classifier, X=train_X, y=train_Y.values.ravel(), cv=5)
# Mean cross-validated R^2 across the 5 folds.
print(all_accuracies.mean())